{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "from sklearn.linear_model import LinearRegression" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def lm(x, y, data, intercept=True):\n", "    \"\"\"Returns the least-squares coefficients from regressing y on x.\n", "\n", "    Non-numeric (categorical) x variables with k levels are expanded into\n", "    k-1 0-1 indicator columns via pd.get_dummies(..., drop_first=True).\n", "\n", "    Inputs:\n", "      - x: a list containing the names of the x variables\n", "      - y: the name of the y variable\n", "      - data: a Pandas data frame (the names in x and y must be columns in this data frame)\n", "      - intercept: boolean indicating whether or not to include an intercept term\n", "\n", "    Outputs: A Pandas series with the estimated coefficients, indexed by\n", "    \"Intercept\" (when intercept is True) followed by the design-matrix column\n", "    names: the numeric x names as given, and \"<name>_<level>\" for each\n", "    indicator column created from a categorical x variable.\n", "    \"\"\"\n", "    # list(x) copies the caller's list, so it is never aliased or mutated.\n", "    design = pd.get_dummies(data[list(x)], drop_first=True).astype(float)\n", "    names = list(design.columns)\n", "    X = design.values\n", "\n", "    if intercept:\n", "        X = np.column_stack([np.ones(len(design)), X])\n", "        names = [\"Intercept\"] + names\n", "\n", "    # lstsq minimizes ||X b - y||^2 directly, avoiding an explicit inverse of X'X,\n", "    # and returns the minimum-norm solution if the columns are collinear.\n", "    beta = np.linalg.lstsq(X, data[y].values.astype(float), rcond=None)[0]\n", "\n", "    return pd.Series(data=beta, index=names)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Some Data To Test Your Code" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "predictors = [\"symboling\", \"normalized-losses\", \"make\", \"fuel-type\",\n", " \"aspiration\", \"num-of-doors\", \"body-style\", \"drive-wheels\",\n", " \"engine-location\", \"wheel-base\", \"length\", \"width\",\n", " \"height\", \"curb-weight\", \"engine-type\", \"num-of-cylinders\",\n", " \"engine-size\", \"fuel-system\", \"bore\", \"stroke\",\n", " \"compression-ratio\", \"horsepower\", \"peak-rpm\", \"city-mpg\",\n", " \"highway-mpg\"]\n", "data = pd.read_csv(\"http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data\",\n", " header=None,\n", " names=predictors + [\"price\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following code strips out missing values (represented by
\"?\" in this data set) and converts columns to numeric types before fitting linear regression to the data." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "print(data.shape)\n", "\n", "for col in data.columns:\n", "    if data[col].dtype == object:\n", "        # Drop rows where this column is missing. The .copy() makes the result an\n", "        # independent frame, so the assignment below does not write into a slice\n", "        # of the previous frame (which can raise SettingWithCopyWarning).\n", "        data = data[data[col] != \"?\"].copy()\n", "        try:\n", "            data[col] = pd.to_numeric(data[col])\n", "        except (ValueError, TypeError):\n", "            # The column holds text that cannot be parsed as numbers: leave it\n", "            # unchanged. Any other kind of error is allowed to surface.\n", "            pass\n", "\n", "print(data.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test 1: Quantitative Predictors Only\n", "\n", "Let's test out the `lm` function you just wrote on some quantitative predictors." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "lm([\"length\", \"width\", \"height\"], \"price\", data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check that your `lm` function produces the same results as scikit-learn." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "model = LinearRegression()\n", "model.fit(data[[\"length\", \"width\", \"height\"]], data[\"price\"])\n", "model.intercept_, model.coef_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test 2: Categorical Predictors\n", "\n", "Your `lm` function should also do the right thing for categorical variables automatically (i.e., it should expand categorical variables with $k$ levels into $k-1$ 0-1 variables automatically)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "lm(predictors, \"price\", data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check that your `lm` function produces the same results as scikit-learn."
] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Expand the non-numeric predictors into 0-1 indicator columns, dropping the\n", "# first level of each (drop_first=True).\n", "data_expanded = pd.get_dummies(data[predictors], drop_first=True)\n", "\n", "# fit() returns the fitted estimator, so construction and fitting can be chained.\n", "model = LinearRegression().fit(data_expanded, data[\"price\"])\n", "\n", "# Display the intercept and the per-column coefficients for comparison with lm.\n", "(model.intercept_, model.coef_)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }